Age of U.S. Presidents

# recent versions of tidyverse include "lubridate" (for working with dates)
library(tidyverse)  # ecosystem of data science packages
library(rvest)      # for web scraping

1 About

Median age for U.S. presidents at inauguration


1.1 Details

  • Topic(s): Politics, U.S., Age of presidents
  • Data:
    • Size: small data
    • Raw data in HTML table (involves basic web scraping)
    • Requires some cleansing (e.g. regex)
    • Requires soft-coding (e.g. create vector for party)
  • Graphic:
    • Type: scatterplot
    • Interactive: no, but we can use "plotly" and "ggiraph"



2 Description

The data visualization of this module is based on the following post/article (by Katherine Schaeffer) of Pew Research Center (Oct 10, 2023):

Most U.S. presidents have been in their 50s at inauguration

https://www.pewresearch.org/short-reads/2023/10/10/most-us-presidents-have-been-in-their-50s-at-inauguration/

2.1 Other Sources

Related to the above post, you can find similar sources from The New York Times and Wikipedia.

NYT: World Leader Ages

https://www.nytimes.com/2020/07/16/opinion/america-presidents-old-age.html

https://www.nytimes.com/2020/10/22/learning/whats-going-on-in-this-graph-world-leader-ages.html

Wikipedia: List of presidents of the United States by age

https://en.wikipedia.org/wiki/List_of_presidents_of_the_United_States_by_age

Wikipedia: List of presidents of the United States

https://en.wikipedia.org/wiki/List_of_presidents_of_the_United_States


3 Data

The data for ages of US presidents is available in Wikipedia:

https://en.wikipedia.org/wiki/List_of_presidents_of_the_United_States_by_age

Specifically, the data is in the HTML table “Presidential age-related data” of the Wikipedia webpage.

3.1 Download Data

For data archival purposes, it’s a good idea to download the HTML page containing the table with the data of interest: “Presidential age-related data”

# Wikipedia base URL and the slug of the page that holds the age table
wiki <- "https://en.wikipedia.org/wiki/"
presidents <- "List_of_presidents_of_the_United_States_by_age"

# Full address of the page
wiki_url <- paste0(wiki, presidents)

# Save a local snapshot of the page, named "<slug>.html"
download.file(url = wiki_url, destfile = paste0(presidents, ".html"))

3.2 Import Data

You can import the data directly from the Wikipedia webpage (via its URL), or by reading in the downloaded HTML file. In both cases we can read the HTML page with the function read_html() from the package "rvest".

# import directly from wikipedia
wiki <- "https://en.wikipedia.org/wiki/"
presidents <- "List_of_presidents_of_the_United_States_by_age"

# full page address = base URL + page slug
wiki_url <- paste0(wiki, presidents)

# parse the live page into an HTML document object
doc <- wiki_url |> read_html()

Because we have a local copy of the HTML page, we prefer this option:

# import from downloaded HTML file (the local snapshot of the Wikipedia page)
doc <- "List_of_presidents_of_the_United_States_by_age.html" |>
  read_html()

The function html_table() extracts ALL tables in an HTML document, and returns them in an R list.

The table we are interested in—Presidential age-related data—is the first one in the list which we can select with the following command:

# collect every table found in the document into a list
tbls <- doc |> html_table()

# take the first table and get rid of its first row
tbl <- tbls[[1]][-1, ]

# print the result
tbl
# A tibble: 47 × 8
   No.   President           Born  Age atstart of presi…¹ Age atend of preside…²
   <chr> <chr>               <chr> <chr>                  <chr>                 
 1 1     George Washington   Feb … 57 years, 67 daysApr … 65 years, 10 daysMar …
 2 2     John Adams          Oct … 61 years, 125 daysMar… 65 years, 125 daysMar…
 3 3     Thomas Jefferson    Apr … 57 years, 325 daysMar… 65 years, 325 daysMar…
 4 4     James Madison       Mar … 57 years, 353 daysMar… 65 years, 353 daysMar…
 5 5     James Monroe        Apr … 58 years, 310 daysMar… 66 years, 310 daysMar…
 6 6     John Quincy Adams   Jul … 57 years, 236 daysMar… 61 years, 236 daysMar…
 7 7     Andrew Jackson      Mar … 61 years, 354 daysMar… 69 years, 354 daysMar…
 8 8     Martin Van Buren    Dec … 54 years, 89 daysMar … 58 years, 89 daysMar …
 9 9     William Henry Harr… Feb … 68 years, 23 daysMar … 68 years, 54 daysApr …
10 10    John Tyler          Mar … 51 years, 6 daysApr 4… 54 years, 340 daysMar…
# ℹ 37 more rows
# ℹ abbreviated names: ¹​`Age atstart of presidency`, ²​`Age atend of presidency`
# ℹ 3 more variables: `Post-presidencytimespan` <chr>, Lifespan <chr>,
#   Lifespan <chr>

3.3 Data Preparation

Here’s the list of (somewhat advanced) steps to do further manipulation so that we have the necessary data for ggplot.

# text columns describing each presidency's start and end
# (cells look like "57 years, 67 daysApr 30, 1789")
start_txt <- tbl[["Age atstart of presidency"]]
end_txt <- tbl[["Age atend of presidency"]]

# age at start of presidential period: leading digits of the cell
age_start <- start_txt |>
  str_extract("^\\d+") |>
  as.numeric()

# age at the end of presidential period: leading digits of the cell
age_end <- end_txt |>
  str_extract("^\\d+") |>
  as.numeric()

# Donald Trump's current age (hard-coded override for row 47)
age_end[47] <- 78

# year the presidency started: trailing digits of the cell
year_start <- start_txt |>
  str_extract("\\d+$") |>
  as.numeric()

# year the presidency ended: first run of four digits in the cell
year_end <- end_txt |>
  str_extract("\\d{4}") |>
  as.numeric()

# hard-coded end year for row 47
year_end[47] <- 2025

3.3.1 Adding Party

# Party affiliation for each of the 47 presidencies, in chronological order
# (one entry per row of `tbl`; Cleveland and Trump appear twice because they
# served non-consecutive terms).
# Only "Democratic" and "Republican" get their own color later on; every
# other label falls back to the default color.
party <- c(
  "Other",                 #  1 Washington (no party)
  "Federalist",            #  2 J. Adams
  "Democratic-Republican", #  3 Jefferson
  "Democratic-Republican", #  4 Madison
  "Democratic-Republican", #  5 Monroe
  "Democratic-Republican", #  6 J. Q. Adams
  "Democratic",            #  7 Jackson
  "Democratic",            #  8 Van Buren
  "Whig",                  #  9 W. H. Harrison
  "Whig",                  # 10 Tyler
  "Democratic",            # 11 Polk
  "Whig",                  # 12 Taylor
  "Whig",                  # 13 Fillmore
  "Democratic",            # 14 Pierce
  "Democratic",            # 15 Buchanan
  "Republican",            # 16 Lincoln
  "Democratic",            # 17 A. Johnson
  "Republican",            # 18 Grant
  "Republican",            # 19 Hayes
  "Republican",            # 20 Garfield
  "Republican",            # 21 Arthur
  "Democratic",            # 22 Cleveland (1st term)
  "Republican",            # 23 B. Harrison
  "Democratic",            # 24 Cleveland (2nd term)
  "Republican",            # 25 McKinley
  "Republican",            # 26 T. Roosevelt
  "Republican",            # 27 Taft
  "Democratic",            # 28 Wilson
  "Republican",            # 29 Harding
  "Republican",            # 30 Coolidge
  "Republican",            # 31 Hoover
  "Democratic",            # 32 F. D. Roosevelt
  "Democratic",            # 33 Truman
  "Republican",            # 34 Eisenhower
  "Democratic",            # 35 Kennedy
  "Democratic",            # 36 L. B. Johnson
  "Republican",            # 37 Nixon
  "Republican",            # 38 Ford
  "Democratic",            # 39 Carter
  "Republican",            # 40 Reagan
  "Republican",            # 41 G. H. W. Bush
  "Democratic",            # 42 Clinton
  "Republican",            # 43 G. W. Bush
  "Democratic",            # 44 Obama
  "Republican",            # 45 Trump (1st term)
  "Democratic",            # 46 Biden
  "Republican"             # 47 Trump (2nd term)
)

3.3.2 Clean Table

Having all the ingredients in place we proceed to assemble a “tidy” table (this can be a "tibble" or also a "data.frame")

# Assemble the tidy table (one row per presidency) and attach a display color:
# blue for Democratic, red for Republican, yellow for every other party.
dat <- tibble(
  president = tbl$President,
  party = party,
  age_start = age_start,
  age_end = age_end,
  year_start = year_start,
  year_end = year_end
) |>
  mutate(
    color = case_when(
      party == "Democratic" ~ "#468BFA",
      party == "Republican" ~ "#FC4E4B",
      .default = "#FFD438"
    )
  )

# print the assembled table
dat
# A tibble: 47 × 7
   president              party      age_start age_end year_start year_end color
   <chr>                  <chr>          <dbl>   <dbl>      <dbl>    <dbl> <chr>
 1 George Washington      Other             57      65       1789     1797 #FFD…
 2 John Adams             Federalist        61      65       1797     1801 #FFD…
 3 Thomas Jefferson       Federalist        57      65       1801     1809 #FFD…
 4 James Madison          Federalist        57      65       1809     1817 #FFD…
 5 James Monroe           Federalist        58      66       1817     1825 #FFD…
 6 John Quincy Adams      Federalist        57      61       1825     1829 #FFD…
 7 Andrew Jackson         Democratic        61      69       1829     1837 #468…
 8 Martin Van Buren       Democratic        54      58       1837     1841 #468…
 9 William Henry Harrison Whig              68      68       1841     1841 #FFD…
10 John Tyler             Whig              51      54       1841     1845 #FFD…
# ℹ 37 more rows

4 Graphs

As usual, let’s go over a series of plotting rounds, starting with a default scatterplot, and then gradually adding more elements, and customizing its appearance to get as close as possible to our target visualization.

4.1 Version 1

We use 2 geom_point() layers since there are 2 kinds of points: the starting age-year, and the ending age-year.

# 1st attempt: one point layer for the start of each term, one for the end
dat |>
  ggplot() +
  geom_point(mapping = aes(x = year_start, y = age_start)) +
  geom_point(mapping = aes(x = year_end, y = age_end))

4.2 Version 2

Adding color to points

# 2nd attempt: same two point layers, now painted with the per-row color
dat |>
  ggplot() +
  geom_point(
    mapping = aes(x = year_start, y = age_start),
    colour = dat$color
  ) +
  geom_point(
    mapping = aes(x = year_end, y = age_end),
    colour = dat$color
  )

4.3 Version 3

Adding median age line

# 3rd attempt: horizontal reference line at the median starting age,
# drawn first so the points sit on top of it
dat |>
  ggplot() +
  geom_hline(yintercept = median(dat$age_start)) +
  geom_point(
    mapping = aes(x = year_start, y = age_start),
    colour = dat$color
  ) +
  geom_point(
    mapping = aes(x = year_end, y = age_end),
    colour = dat$color
  )

4.4 Version 4

# 4th attempt: dotted guide lines every 10 years of age (30 to 90),
# with y-axis breaks at the same values
dat |>
  ggplot() +
  geom_hline(
    yintercept = seq(from = 30, to = 90, by = 10),
    linetype = "dotted"
  ) +
  geom_hline(yintercept = median(dat$age_start)) +
  geom_point(
    mapping = aes(x = year_start, y = age_start),
    colour = dat$color
  ) +
  geom_point(
    mapping = aes(x = year_end, y = age_end),
    colour = dat$color
  ) +
  scale_y_continuous(breaks = seq(from = 30, to = 90, by = 10))

4.5 Version 5

# 5th attempt: connect each term's start point to its end point
# with a (default-styled) segment
dat |>
  ggplot() +
  geom_hline(
    yintercept = seq(from = 30, to = 90, by = 10),
    linetype = "dotted"
  ) +
  geom_hline(yintercept = median(dat$age_start)) +
  geom_point(
    mapping = aes(x = year_start, y = age_start),
    colour = dat$color
  ) +
  geom_point(
    mapping = aes(x = year_end, y = age_end),
    colour = dat$color
  ) +
  geom_segment(
    mapping = aes(
      x = year_start, y = age_start,
      xend = year_end, yend = age_end
    )
  ) +
  scale_y_continuous(breaks = seq(from = 30, to = 90, by = 10))

4.6 Version 6

# 6th attempt: hollow-circle points (shape 21) and thicker, semi-transparent
# segments colored like their points
dat |>
  ggplot() +
  geom_hline(
    yintercept = seq(from = 30, to = 90, by = 10),
    linetype = "dotted"
  ) +
  geom_hline(yintercept = median(dat$age_start)) +
  geom_point(
    mapping = aes(x = year_start, y = age_start),
    colour = dat$color,
    shape = 21
  ) +
  geom_point(
    mapping = aes(x = year_end, y = age_end),
    colour = dat$color,
    shape = 21
  ) +
  geom_segment(
    mapping = aes(
      x = year_start, y = age_start,
      xend = year_end, yend = age_end
    ),
    linewidth = 1.5,
    colour = dat$color,
    alpha = 0.5
  ) +
  scale_y_continuous(breaks = seq(from = 30, to = 90, by = 10))

4.7 Version 7

# 7th attempt: label each term at its START point with the last word
# of the president's name
dat |>
  ggplot() +
  geom_hline(
    yintercept = seq(from = 30, to = 90, by = 10),
    linetype = "dotted"
  ) +
  geom_hline(yintercept = median(dat$age_start)) +
  geom_point(
    mapping = aes(x = year_start, y = age_start),
    colour = dat$color,
    shape = 21
  ) +
  geom_point(
    mapping = aes(x = year_end, y = age_end),
    colour = dat$color,
    shape = 21
  ) +
  geom_segment(
    mapping = aes(
      x = year_start, y = age_start,
      xend = year_end, yend = age_end
    ),
    linewidth = 1.5,
    colour = dat$color,
    alpha = 0.5
  ) +
  geom_text(
    mapping = aes(
      x = year_start, y = age_start,
      label = str_extract(president, "\\w+$")
    ),
    size = 2.5,
    colour = dat$color
  ) +
  scale_y_continuous(breaks = seq(from = 30, to = 90, by = 10))

4.8 Version 8

# 8th attempt: move the labels to the END point of each term and
# nudge them upward (vjust) so they do not sit on the point
dat |>
  ggplot() +
  geom_hline(
    yintercept = seq(from = 30, to = 90, by = 10),
    linetype = "dotted"
  ) +
  geom_hline(yintercept = median(dat$age_start)) +
  geom_point(
    mapping = aes(x = year_start, y = age_start),
    colour = dat$color,
    shape = 21
  ) +
  geom_point(
    mapping = aes(x = year_end, y = age_end),
    colour = dat$color,
    shape = 21
  ) +
  geom_segment(
    mapping = aes(
      x = year_start, y = age_start,
      xend = year_end, yend = age_end
    ),
    linewidth = 1.5,
    colour = dat$color,
    alpha = 0.5
  ) +
  geom_text(
    mapping = aes(
      x = year_end, y = age_end,
      label = str_extract(president, "\\w+$")
    ),
    size = 2.5,
    colour = dat$color,
    vjust = -0.5
  ) +
  scale_y_continuous(breaks = seq(from = 30, to = 90, by = 10))

4.9 Final attempt

# 9th attempt (final): segments are drawn before the points so the
# white-filled circles cover the segment ends; guide lines are grayed out,
# the median line is thinned, and titles, x-axis breaks and theme are set.
dat |>
  ggplot() +
  # dotted guide lines every 10 years of age
  geom_hline(
    yintercept = seq(from = 30, to = 90, by = 10),
    linetype = "dotted",
    colour = "gray70"
  ) +
  # median age at the start of a term
  geom_hline(
    yintercept = median(dat$age_start),
    linewidth = 0.25
  ) +
  # one segment per term, from (start year, start age) to (end year, end age)
  geom_segment(
    mapping = aes(
      x = year_start, y = age_start,
      xend = year_end, yend = age_end
    ),
    linewidth = 1.5,
    colour = dat$color,
    alpha = 0.5
  ) +
  # term start points
  geom_point(
    mapping = aes(x = year_start, y = age_start),
    colour = dat$color,
    shape = 21,
    fill = "white"
  ) +
  # term end points
  geom_point(
    mapping = aes(x = year_end, y = age_end),
    colour = dat$color,
    shape = 21,
    fill = "white"
  ) +
  # label (last word of the name) just above each end point
  geom_text(
    mapping = aes(
      x = year_end, y = age_end,
      label = str_extract(president, "\\w+$")
    ),
    size = 2.5,
    colour = dat$color,
    vjust = -0.5
  ) +
  scale_y_continuous(breaks = seq(from = 30, to = 90, by = 10)) +
  scale_x_continuous(breaks = seq(from = 1780, to = 2024, by = 12)) +
  labs(
    title = "Ages of U.S. presidents, 1789-2025",
    subtitle = "The median age at first inauguration is 55 years old",
    x = "",
    y = "age"
  ) +
  # strip the default panel background, axis lines and horizontal grid lines
  theme(
    axis.line = element_blank(),
    panel.background = element_blank(),
    panel.grid.major.y = element_blank(),
    panel.grid.minor.y = element_blank()
  )